Mouse and Human cell recovery

A 10x genomics scRNA-Seq library was constructed from a mix of 293T and NIH3T3 cells. A mouse and a human cell barcode was selected for pulldown with a biotinylated DNA oligo with LNA bases added every third nucleotide.

Following reamplification the 2 cell libraries were pooled and resequenced together. The raw fastqs were then processed using a Snakemake pipeline, to produce two processed data files:

  1. A matrix with UMIs per cell (column) per gene (rows) (dge_matrix.txt)
  2. A flatfile with per UMI information (umigroups.txt.gz)

This RMarkdown document will produce the following figures:

  1. …
  2. …
  3. …

Organize single cell libraries

First designate the libraries and the cells that were resampled.

cells <- list(
  mouse_human_cell_pulldown = c("GACGTTAGTGCCTGTG",
                                "CTGATCCCATGACGGA"))

libs <- c(
  "original_10x",
  "mouse_human_cell_pulldown")

bc_metadat <- read_tsv(file.path(data_dir, 
                         "lna_control", 
                         "fastq",
                         "original", 
                         "barcodes_from_10x_run.txt"),
                         col_names = c("cell_id", "barcode_10x")) 

## original library to compare against
reflib <- "original_10x"
resampled_libs <- c("mouse_human_cell_pulldown")

## reference resampled lib for resampled vs control plots
resampled_lib <- "mouse_human_cell_pulldown"

## pretty name for libraries
lib_names = c(
  original_10x = "Original Library",
  mouse_human_cell_pulldown = "Resampled Library"
)

## pretty names for cells
cell_names = c(
  "GACGTTAGTGCCTGTG-1" = "Mouse Cell",
  "CTGATCCCATGACGGA-1" = "Human Cell")

Load and organize a table for each library of read counts per cell per gene, and a table of umi counts per cell per gene.

umis_to_genes <- function(umipath, cells_to_exclude = c("Cell_unmatched")){
  umis <- read_tsv(umipath,
                   col_names = c("barcode_10x", 
                                 "umi_molecule", 
                                 "count")) %>% 
    filter(barcode_10x != cells_to_exclude)
  
  mol_fields <- str_count(umis$umi_molecule[1], "::")
  
  if(mol_fields == 2 ){
    umis <- separate(umis, umi_molecule, 
                     into = c("seq", "genome", "gene"),
                     sep = "::") %>% 
      mutate(gene = str_c(genome, "::", gene))
  } else if (mol_fields == 1){
    umis <- separate(umis, umi_molecule, 
                     into = c("seq", "gene"),
                     sep = "::")
  } else {
    stop("separator :: missing from umi_molecule field")
  }
  
  reads <- select(umis, 
                  barcode_10x, 
                  gene,
                  count)
  
  reads <- group_by(reads, 
                   barcode_10x, gene) %>% 
    summarize(counts = sum(count))
  
  reads <- spread(reads, barcode_10x, counts, 
                  fill = 0L)
  
  reads
}

## read in umigroups flat file with read counts per umi per gene per cell
## expand out to a read count matrix
umipaths <- file.path(data_dir, 
                      libs, 
                      "umis",
                      "umigroups.txt.gz")
read_dat <- map(umipaths, 
                ~umis_to_genes(.))
names(read_dat) <- libs

## read in umi_tools count table with umi counts per gene per cell
## drop rows with 0 counts
umi_dat <- map(libs, 
                ~read_tsv(file.path(data_dir, 
                          .x,
                          "dgematrix",
                          "dge_matrix.txt")) %>% 
                 select(-matches("Cell_unmatched")) %>% 
                 .[rowSums(.[, -1]) > 0, ])

names(umi_dat) <- libs

# add in cell info, including info for the original sample
cell_obj_mdata <- map(c(cells[1], cells), 
                      ~mutate(bc_metadat, 
                              resampled = ifelse(barcode_10x %in% .x,
                                                  TRUE,
                                                  FALSE)))
names(cell_obj_mdata) <- libs

Next organize these tables into simple classes called resampled-sets to keep track of each experiment’s relavant raw, processed, and meta data.

#' simple class to hold info for each experiment
create_sc_obj <- function(umi_df,
                          read_df,
                          cell_mdata_df){
  x <- list()
  class(x) <- "resampled-set"
  x$umis <- umi_df
  x$reads <- read_df
  x$meta_dat <- cell_mdata_df
  return(x)
}

sc_objs <- list(umi_dat, read_dat, cell_obj_mdata)
sc_objs <- pmap(sc_objs, create_sc_obj)

rm(umi_dat)
rm(read_dat)

Next perform basic processing. 1) generate separate objects to store sparse matrices of umi and read counts. 2) normalize read and umi count data by total library size (sum of all read or umi counts for all cells in the experiment) and report as Reads per million or UMIs per million. 3) Compute per cell metrics (read and umi counts, sequencing saturation)

tidy_to_matrix <- function(df){
   df <- as.data.frame(df)
   rownames(df) <- df[, 1]
   df[, 1] <- NULL
   mat <- as.matrix(df)
   mat <- as(mat, "sparseMatrix")   
   return(mat)
}

#' keep both tidy and matrix objs
generate_matrices <- function(sc_obj){
  sc_obj$umi_matrix <- tidy_to_matrix(sc_obj$umis)
  sc_obj$read_matrix <- tidy_to_matrix(sc_obj$reads)
  sc_obj
}

#' normalize by library size (Reads per Million)
norm_libsize <- function(sc_obj){
  sc_obj$norm_umi <- 1e6 * sweep(sc_obj$umi_matrix, 2, 
                                 sum(as.vector(sc_obj$umi_matrix)), "/")
  sc_obj$norm_reads <- 1e6 * sweep(sc_obj$read_matrix, 2, 
                                   sum(as.vector(sc_obj$read_matrix)), "/")
  sc_obj
}

add_metadata <- function(sc_obj, dat){
  if (is.vector(dat)){
    new_colname <- deparse(substitute(dat))
    df <- data_frame(!!new_colname := dat)
    df[[new_colname]] <- dat
    df[["cell_id"]] <- names(dat)
    sc_obj$meta_dat <- left_join(sc_obj$meta_dat,
                                 df,
                                 by = "cell_id")
    
  } else if (is.data.frame(dat)) {
    sc_obj$meta_dat <- left_join(sc_obj$meta_dat,
                                 dat,
                                 by = "cell_id")
  }
  sc_obj
}

compute_summaries <- function(sc_obj){
  ## raw counts
  total_umis <- colSums(sc_obj$umi_matrix)
  names(total_umis) <- colnames(sc_obj$umi_matrix)
  total_reads <- colSums(sc_obj$read_matrix)
  names(total_reads) <- colnames(sc_obj$read_matrix)
  
  ## norm counts
  norm_total_umis <- colSums(sc_obj$norm_umi)
  names(norm_total_umis) <- colnames(sc_obj$norm_umi)
  norm_total_reads <- colSums(sc_obj$norm_reads)
  names(norm_total_reads) <- colnames(sc_obj$norm_reads)
    
  sc_obj <- add_metadata(sc_obj, total_umis)
  sc_obj <- add_metadata(sc_obj, total_reads)
  sc_obj <- add_metadata(sc_obj, norm_total_umis)
  sc_obj <- add_metadata(sc_obj, norm_total_reads)
  
  ## compute cDNA duplication rate 
  sc_obj$meta_dat$cDNA_duplication <- 1 - (sc_obj$meta_dat$total_umis /
                                             sc_obj$meta_dat$total_reads)
  
  sc_obj
}

sc_objs <- map(sc_objs, generate_matrices)
sc_objs <- map(sc_objs, norm_libsize)
sc_objs <- map(sc_objs, compute_summaries)

Compute enrichment of reads/umis over the original library.

sc_objs <- map(sc_objs,
    function(sub_dat){
      og_counts <- select(sc_objs[[reflib]]$meta_dat,
                          og_total_reads = total_reads,
                          og_total_umis = total_umis,
                          og_norm_total_umis = norm_total_umis,
                          og_norm_total_reads = norm_total_reads,
                          og_cDNA_duplication = cDNA_duplication,
                          cell_id)
      sub_dat$meta_dat <- left_join(sub_dat$meta_dat,
                         og_counts, 
                         by = "cell_id")
      
      sub_dat$meta_dat <- mutate(sub_dat$meta_dat,
                                 read_proportion = log2( total_reads / og_total_reads),
                                 umi_proportion = log2( total_umis / og_total_umis),
                                 norm_read_proportion = log2( norm_total_reads /
                                                                og_norm_total_reads),
                                 norm_umi_proportion = log2( norm_total_umis /
                                                               og_norm_total_umis))
      sub_dat
    })

plot cDNA duplication rate

sc_metadat <- map(sc_objs, ~.x$meta_dat) %>% 
  bind_rows(.id = "library") %>% 
  mutate(library = factor(library, levels = libs)) %>% 
  arrange(resampled)

plt <- ggplot(sc_metadat, aes(total_umis, cDNA_duplication)) +
  geom_point(aes(color = resampled), size = 0.5) +
  labs(x = expression(paste("# of UMIs")),
       y = "Sequencing saturation") + 
  scale_x_log10(labels = scales::comma) +
  scale_color_manual(values = color_palette) +
  facet_wrap(~library,
             labeller = labeller(library = lib_names)) +
  theme_cowplot(font_size = 16, line_size = 1)  +
  theme(legend.position = "top",
        plot.margin = unit(c(5.5, 20.5, 5.5, 5.5), 
                           "points")) 

plt
Note the high level of sequencing saturation (0 = no-duplication, 1 = all duplicates) in the original library. Also note that the libraries tend to have higher saturatioin rates, after subsampling.

Note the high level of sequencing saturation (0 = no-duplication, 1 = all duplicates) in the original library. Also note that the libraries tend to have higher saturatioin rates, after subsampling.

save_plot("cDNA_duplication.pdf", plt, 
          base_aspect_ratio = 1.6)

plot read and umi counts per library

global_plot_theme <- theme(
        legend.position = "top",
        legend.text = element_text(size = 10),
        strip.text = element_text(size = 8))

resampled_metadat <- filter(sc_metadat,
                             library != reflib) %>% 
  mutate(library = factor(library, 
                          levels = resampled_libs))

unnorm_plt <- ggplot(resampled_metadat, 
                     aes(og_total_reads / 3, total_reads / 3, colour = resampled)) + 
  geom_point(size = 0.5) + 
  geom_abline(slope = 1) +
  facet_wrap(~library, nrow = 1) + 
  coord_fixed() +
  xlab("original library\nreads count (Thousands)") +
  ylab("resampled library\nreads count (Thousands)") +
 # ggtitle("Raw reads associated with each cell barcode") +
  scale_colour_manual(name = "resampled:",
                      values = color_palette) +
  theme_cowplot(font_size = 10, line_size = .5) +
  global_plot_theme

norm_plt <- ggplot(resampled_metadat, aes(og_norm_total_reads / 1e3, norm_total_reads / 1e3, colour = resampled)) + 
  geom_point(size = 0.5) + 
  geom_abline(slope = 1) + 
  facet_wrap(~library, nrow = 1) + 
  xlab("original library\nRPM (Thousands)") +
  ylab("resampled library\nRPM (Thousands)") +
  scale_colour_manual(name = "resampled:",
                      values = color_palette) +
  theme_cowplot(font_size = 10, line_size = 0.5) +
  theme(aspect.ratio = 1) + 
  global_plot_theme

unnorm_umi_plt <- ggplot(resampled_metadat, 
                         aes(og_total_umis / 1e3, total_umis / 1e3, colour = resampled)) + 
  geom_point(size = 0.5) + 
  geom_abline(slope = 1) +
  coord_fixed() +
  facet_wrap(~library, nrow = 1) + 
  xlab("original library\nUMI count (Thousands)") +
  ylab("resampled library\nUMI count (Thousands)") +
  scale_colour_manual(name = "resampled:",
                      values = color_palette) +
  theme_cowplot(font_size = 10, line_size = .5) +
  global_plot_theme

norm_umi_plt <- ggplot(resampled_metadat, 
                       aes(og_norm_total_umis / 1e3, norm_total_umis / 1e3, colour = resampled)) + 
  geom_point(size = 0.5) + 
  geom_abline(slope = 1) + 
  facet_wrap(~library, nrow = 1) + 
  xlab("original library\nUMI normalized RPM (Thousands)") +
  ylab("resampled library\nUMIs per Million (Thousands)") +
  scale_colour_manual(name = "resampled:",
                      values = color_palette) +
  theme_cowplot(font_size = 10, line_size = 0.5) +
  theme(aspect.ratio = 1) + 
  global_plot_theme

plt <- plot_grid(unnorm_plt, norm_plt, unnorm_umi_plt, norm_umi_plt, 
                 labels = "AUTO",
                 align = 'hv')
plt

save_plot("reads_per_barcode_scatterplots.pdf", plt, base_width = 8 )

Plot enrichment of reads/umis

read <- ggplot(resampled_metadat, 
       aes(cell_id, norm_read_proportion, colour = resampled)) + 
  geom_point() + 
  labs(x = "Cell", 
       y = expression(paste( " Log"[2], " normalized reads ", frac("resampled", "original")))) +
  scale_colour_manual(name = "resampled:", values = color_palette) +
  facet_wrap(~library, nrow = 1) + 
  theme_cowplot(font_size = 16, line_size = 1) +
  theme(axis.text.x = element_blank(),
        legend.position = "top",
        legend.text = element_text(size = 12))

umi <- ggplot(resampled_metadat, 
       aes(cell_id, norm_umi_proportion, colour = resampled)) + 
  geom_point() + 
  labs(x = "Cell", 
       y = expression(paste( "Log"[2], " normalized UMIs ", frac("resampled", "original")))) +
  scale_colour_manual(name = "resampled:", values = color_palette) +
  facet_wrap(~library, nrow = 1) + 
  theme_cowplot(font_size = 16, line_size = 1) +
  theme(axis.text.x = element_blank(),
        legend.position = "top",
        legend.text = element_text(size = 12))

plt <- plot_grid(read, umi,
                 labels = "AUTO",
                 align = 'hv')
plt

ggsave("reads_umi_ratio_per_barcode_normalized.pdf", width = 8, height = 5)



umi_plots <- map(split(resampled_metadat, resampled_metadat$library),
  function(x){
    ggplot(x, 
       aes(og_total_umis, norm_umi_proportion, colour = resampled)) + 
  geom_point(size = 0.5) + 
  geom_hline(aes(yintercept = 0), 
             linetype ="dashed", 
             color = "darkgrey") + 
  labs(x = "Abundance in original library\n (UMIs)", 
       y = expression(paste( "Log"[2], " UMIs ", 
                             frac("resampled", "original")))) +
  scale_x_continuous(labels = scales::comma) +      
  scale_colour_manual(name = "resampled:", values = color_palette) +
 # facet_wrap(~library, nrow = 1) + 
  theme_cowplot(font_size = 16, line_size = 1) +
  theme(legend.position = "top",
        legend.text = element_text(size = 12))})

plt <- plot_grid(plotlist = umi_plots, nrow = 1)
save_plot("umi_ratio_MA.pdf", plt)
ggplot(resampled_metadat, aes(resampled, 
                read_proportion, fill = resampled)) + 
  geom_boxplot(coef = Inf) + 
  facet_wrap(~library, nrow = 1) +
  xlab("Selected for Reamplification") +
  ylab(expression(paste( "Log"[2], " Reads ", frac("resampled", "original")))) +
  scale_fill_manual(name = "resampled:",
                    values = color_palette) +
  theme_cowplot(font_size = 16, line_size = 0.5) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    legend.position = "top",
    legend.text = element_text(size = 12)
  )

ggsave("reads_ratio_per_barcode_boxplot.pdf", width = 6, height = 5)

ggplot(resampled_metadat, aes(resampled, 
                umi_proportion, fill = resampled)) + 
  geom_boxplot(coef = Inf) + 
  facet_wrap(~library, nrow = 1) +
  xlab("Selected for Reamplification") +
  ylab(expression(paste( "Log"[2], " UMIs ", frac("resampled", "original")))) +
  scale_fill_manual(name = "resampled:",
                    values = color_palette) +
  theme_cowplot(font_size = 16, line_size = 0.5) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    legend.position = "top",
    legend.text = element_text(size = 12)
  )

ggsave("umi_ratio_per_barcode_boxplot.pdf", width = 6, height = 5)

ggplot(resampled_metadat, aes(resampled, 
                norm_read_proportion, fill = resampled)) + 
  geom_boxplot(coef = Inf) + 
  facet_wrap(~library, nrow = 1) +
  xlab("Selected for Reamplification") +
  ylab(expression(paste( "Log"[2], " normalized Reads ", frac("resampled", "original")))) +
  scale_fill_manual(name = "resampled:",
                    values = color_palette) +
  theme_cowplot(font_size = 16, line_size = 0.5) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    legend.position = "top",
    legend.text = element_text(size = 12)
  )

ggsave("norm_reads_ratio_per_barcode_boxplot.pdf", width = 6, height = 5)

ggplot(resampled_metadat, aes(resampled, 
                norm_umi_proportion, fill = resampled)) + 
  geom_boxplot(coef = Inf) + 
  facet_wrap(~library, nrow = 1) +
  xlab("Selected for Reamplification") +
  ylab(expression(paste( "Log"[2], " normalized UMIs ", frac("resampled", "original")))) +
  scale_fill_manual(name = "resampled:",
                    values = color_palette) +
  theme_cowplot(font_size = 16, line_size = 0.5) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    legend.position = "top",
    legend.text = element_text(size = 12)
  )

ggsave("norm_umi_ratio_per_barcode_boxplot.pdf", width = 6, height = 5)
dat <- group_by(resampled_metadat, library) %>%
  filter(library != reflib) %>% 
  mutate(total_new = sum(total_reads, na.rm = T), 
         total_old = sum(og_total_reads, na.rm = T))

dat_group <- group_by(dat, library, resampled) %>% 
  summarize(total_new = sum(total_reads, 
                            na.rm = T) / unique(total_new), 
            total_old = sum(og_total_reads, 
                            na.rm = T) / unique(total_old)) %>% 
  gather(lib, percent_lib, -library, -resampled ) %>%
  mutate(lib = factor(lib, levels = c("total_old", "total_new"), 
                      labels = c("original\nlibrary", "resampled\nlibrary")))

ggplot(dat_group, aes(lib, percent_lib, fill = resampled)) + 
  geom_bar(stat = "identity") + 
  ylab("Fraction of\n Reads Assigned") +
  scale_fill_manual(name = "resampled:",
                    values = color_palette) +
  facet_wrap(~library) + 
  theme_cowplot(font_size = 16, line_size = 1) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5),
    legend.position = "top",
    legend.text = element_text(size = 16)
  )

ggsave("proportion_reads_all_barcode_barplot.pdf", width = 7, height = 7)

dat_group %>% 
 rename(Method = library) %>% 
  spread(lib, percent_lib) %>% 
  mutate(`Targeted Library Read Fold-Enrichment` = `resampled\nlibrary` / `original\nlibrary`) %>%
  filter(resampled == T) %>% 
  select(Method, `Targeted Library Read Fold-Enrichment`)

Compare species

add_species_counts <- function(sc_obj, 
                               mouse_gene_prefix = "mm38::",
                               human_gene_prefix = "hg38::"){
  
  ## get mouse and human reads
  g_ids <- rownames(sc_obj$read_matrix)
  mouse_ids <- str_subset(g_ids, str_c("^", mouse_gene_prefix))
  human_ids <- str_subset(g_ids, str_c("^", human_gene_prefix))
  
  mouse_reads = colSums(sc_obj$read_matrix[mouse_ids, ])
  human_reads = colSums(sc_obj$read_matrix[human_ids, ])
  
  ## get mouse and human UMIs
  g_ids <- rownames(sc_obj$umi_matrix)
  mouse_ids <- str_subset(g_ids, str_c("^", mouse_gene_prefix))
  human_ids <- str_subset(g_ids, str_c("^", human_gene_prefix))
  
  mouse_umis = colSums(sc_obj$umi_matrix[mouse_ids, ])
  human_umis = colSums(sc_obj$umi_matrix[human_ids, ])
  
  ## get norm counts for reads 
  g_ids <- rownames(sc_obj$norm_reads)
  mouse_ids <- str_subset(g_ids, str_c("^", mouse_gene_prefix))
  human_ids <- str_subset(g_ids, str_c("^", human_gene_prefix))
  
  norm_human_reads <- colSums(sc_obj$norm_reads[human_ids, ])
  norm_mouse_reads <- colSums(sc_obj$norm_reads[mouse_ids, ])
  
  ## get norm counts for umis
  g_ids <- rownames(sc_obj$norm_umi)
  mouse_ids <- str_subset(g_ids, str_c("^", mouse_gene_prefix))
  human_ids <- str_subset(g_ids, str_c("^", human_gene_prefix))
  
  norm_human_umis <- colSums(sc_obj$norm_umi[human_ids, ])
  norm_mouse_umis <- colSums(sc_obj$norm_umi[mouse_ids, ])
  
  sc_obj <- add_metadata(sc_obj, human_reads)
  sc_obj <- add_metadata(sc_obj, mouse_reads)
  sc_obj <- add_metadata(sc_obj, human_umis)
  sc_obj <- add_metadata(sc_obj, mouse_umis)
  sc_obj <- add_metadata(sc_obj, norm_human_reads)
  sc_obj <- add_metadata(sc_obj, norm_mouse_reads)
  sc_obj <- add_metadata(sc_obj, norm_human_umis)
  sc_obj <- add_metadata(sc_obj, norm_mouse_umis)
  
  ## make sure mouse + human == total
  stopifnot(all(sc_obj$meta_dat$total_reads == sc_obj$meta_dat$mouse_reads + 
                                               sc_obj$meta_dat$human_reads, na.rm = T))
  
  stopifnot(all(sc_obj$meta_dat$total_umis == sc_obj$meta_dat$mouse_umis + 
                                               sc_obj$meta_dat$human_umis, na.rm = T))
  ## check floating point totals
  tol <- 1e-5
  reads_check <- all(abs(sc_obj$meta_dat$norm_total_reads - (sc_obj$meta_dat$norm_mouse_reads + 
                                               sc_obj$meta_dat$norm_human_reads)) <= tol, na.rm = T)
  stopifnot(reads_check)
  
  umis_check <- all(abs(sc_obj$meta_dat$norm_total_umis - (sc_obj$meta_dat$norm_mouse_umis + 
                                               sc_obj$meta_dat$norm_human_umis)) <= tol, na.rm = T)
  stopifnot(umis_check)
  ## calculate species purity (human / human + mouse)
  sc_obj$meta_dat <- mutate(sc_obj$meta_dat, 
                            purity_reads = human_reads / (human_reads + mouse_reads),
                            purity_umis = human_umis / (human_umis + mouse_umis))
  sc_obj
}

sc_objs <- map(sc_objs, add_species_counts)
## add in metadat columns for original mouse and original human data
sc_objs <- map(sc_objs,
    function(sub_dat){
      og_dat <- select(sc_objs[[reflib]]$meta_dat,
                          cell_id,
                          str_subset(colnames(sc_objs[[reflib]]$meta_dat),
                                     "human|mouse|purity"))
                          
      cols <- colnames(og_dat)
      new_cols <- c("cell_id", str_c("og_", cols[2:length(cols)]))
      colnames(og_dat) <- new_cols
      
      sub_dat$meta_dat <- left_join(sub_dat$meta_dat,
                         og_dat, 
                         by = "cell_id")
      
      sub_dat$meta_dat <- mutate(sub_dat$meta_dat,
                                 norm_human_read_proportion = log2( norm_human_reads /
                                                                og_norm_human_reads),
                                 norm_human_umi_proportion = log2( norm_human_umis /
                                                               og_norm_human_umis),
                                 norm_mouse_read_proportion = log2( norm_mouse_reads /
                                                                og_norm_mouse_reads),
                                 norm_mouse_umi_proportion = log2( norm_mouse_umis /
                                                               og_norm_mouse_umis))
      
      sub_dat
    })


resampled_metadat <- map(sc_objs, ~.x$meta_dat) %>% 
  bind_rows(.id = "library") %>% 
  mutate(library = factor(library, 
                          levels = libs)) %>% 
  arrange(resampled)
read_plots <- map(split(resampled_metadat, resampled_metadat$library),
  function(x){
  ggplot(x, 
         aes(human_reads / 1e3, mouse_reads / 1e3, 
                               colour = resampled)) +
  geom_point(size = 0.5) + 
  facet_wrap(~library, nrow = 1, 
             scales = "free", 
             labeller = labeller(library = lib_names)) + 
  xlab("Human reads (Thousands)") +
  ylab("Mouse reads (Thousands)") +
  scale_colour_manual(name = "resampled:",
                      values = color_palette) +
  theme_cowplot(font_size = 16, line_size = 1) +
  theme(legend.position = "top",
        legend.text = element_text(size = 12))
  })
plt <- plot_grid(plotlist = read_plots,
                 nrow = 1)
save_plot("read_scatterplot_human_mouse.pdf", 
          plt, ncol = 3, nrow = 1,
          base_width = 4, base_height = 4)

umi_plots <- map(split(resampled_metadat, resampled_metadat$library),
  function(x){
  ggplot(x,
       aes(human_umis, 
           mouse_umis, 
           colour = resampled)) +
  geom_point(size = 0.5) + 
  facet_wrap(~library, nrow = 1, 
             scales = "free",
             labeller = labeller(library = lib_names)) + 
  xlab("Human UMIs") +
  ylab("Mouse UMIs") +
  scale_x_continuous(labels = scales::comma) + 
  scale_y_continuous(labels = scales::comma) +
  scale_colour_manual(name = "resampled:",
                      values = color_palette) +
  theme_cowplot(font_size = 16, line_size = 1) +
  theme(legend.position = "top",
        legend.text = element_text(size = 12))
  })
plt <- plot_grid(plotlist = umi_plots,
                 nrow = 1)
save_plot("umi_scatterplot_human_mouse.pdf", 
          plt, ncol = 2, nrow = 1,
          base_width = 4, base_height = 4)

Genes detected

## compute per gene or per gene/umi combo enrichment
detected_molecules <- function(sc_obj, molecule = "gene"){
  umis <- sc_obj$umi_matrix
  if (molecule == "gene"){
    human_mat <- umis[str_detect(rownames(umis), "hg38::"), ]
    mouse_mat <- umis[str_detect(rownames(umis), "mm38::"), ]
    n_human_genes <- colSums(human_mat > 0)
    n_mouse_genes <- colSums(mouse_mat > 0)
    out_mdat <- data_frame(cell_id = colnames(umis),
      n_human_genes = n_human_genes,
      n_mouse_genes = n_mouse_genes)
    sc_obj <- add_metadata(sc_obj, out_mdat)
    }
}
sc_objs <- map(sc_objs, ~detected_molecules(.x))
resampled_metadat <- map(sc_objs, ~.x$meta_dat) %>% 
  bind_rows(.id = "library") %>% 
   mutate(library = factor(library, 
                          levels = libs))

og_genes <- filter(resampled_metadat, 
                   library == reflib) %>% 
  dplyr::select(cell_id, 
                og_human_genes = n_human_genes, 
                og_mouse_genes = n_mouse_genes)

resampled_metadat <- left_join(resampled_metadat, 
                                og_genes,
                                by = "cell_id")

ggplot(resampled_metadat, aes(n_human_genes, 
                               og_human_genes, colour = resampled)) + 
  geom_point() + 
  ylab("resampled_genes") +
  xlab("original_genes") + 
  scale_colour_manual(name =  "resampled:", values = color_palette) +
  facet_wrap(~library, nrow = 1) + 
  theme_cowplot(font_size = 16, line_size = 1) +
  theme(
    legend.position = "top",
    legend.text = element_text(size = 16)
  )

ggplot(resampled_metadat, aes(n_mouse_genes, 
                               og_mouse_genes, colour = resampled)) + 
  geom_point() + 
  ylab("resampled_genes") +
  xlab("original_genes") + 
  scale_colour_manual(name =  "resampled:", values = color_palette) +
  facet_wrap(~library, nrow = 1) + 
  theme_cowplot(font_size = 16, line_size = 1) +
  theme(
    legend.position = "top",
    legend.text = element_text(size = 16)
  )

Parse out new versus previously identified genes

calc_gene_sensitivity <- function(sc_obj, 
                                  type = "umi",
                                  mouse_gene_prefix = "mm38::",
                                  human_gene_prefix = "hg38::"){
  
  if (type == "umi"){
    count_matrix <- sc_obj$umi_matrix
  } else {
    count_matrix <- sc_obj$read_matrix
  }
  # generate list named with barcode of each detected gene and 
  # respective read/umi count
  genes_detected <- apply(count_matrix, 2, function(x) x[x > 0])
  sc_obj$genes_detected <- genes_detected
  sc_obj
}

sc_objs <- map(sc_objs, calc_gene_sensitivity)
sc_objs <- map(sc_objs, 
           function(x){
             og_genes <- sc_objs[[reflib]]$genes_detected
             sub_genes <- x$genes_detected
             
             # subset list of cell barcodes to the same as the og experiment
             # and also reorders the barcodes to match
             sub_genes <- sub_genes[names(og_genes)]
             
             if(length(sub_genes) != length(og_genes)){
               stop("barcode lengths not the same")
             }
             shared_genes <- map2(sub_genes, 
                                  og_genes,
                                  ~intersect(names(.x),
                                             names(.y)))
             new_genes <- map2(sub_genes,
                               og_genes,
                               ~setdiff(names(.x),
                                        names(.y)))
             
             not_recovered_genes <- map2(og_genes,
                                         sub_genes,
                                         ~setdiff(names(.x),
                                                  names(.y)))
             x$shared_genes <- shared_genes
             x$new_genes <- new_genes
             x$not_recovered_genes <- not_recovered_genes
             return(x)
             })

## add gene recovery info to meta data table
sc_objs <- map(sc_objs, 
           function(x){
             shared_genes <- map2_dfr(x$shared_genes, 
                                names(x$shared_genes),
                                function(x, y){
                                  mouse <- sum(str_detect(x, "^mm38::")) ;
                                  human <- sum(str_detect(x, "^hg38::")) ;
                                  data_frame(cell_id = y,
                                            mouse_shared_genes = mouse,
                                            human_shared_genes = human,
                                            shared_genes = mouse + human)
                                 })
             
             not_recovered_genes <- map2_dfr(x$not_recovered_genes, 
                                names(x$not_recovered_genes),
                                function(x, y){
                                  mouse <- sum(str_detect(x, "^mm38::")) ;
                                  human <- sum(str_detect(x, "^hg38::")) ;
                                  data_frame(cell_id = y,
                                            mouse_not_recovered_genes = mouse,
                                            human_not_recovered_genes = human,
                                            not_recovered_genes = mouse + human)
                                 })
             
             new_genes <- map2_dfr(x$new_genes, 
                                names(x$new_genes),
                                function(x, y){
                                  mouse <- sum(str_detect(x, "^mm38::")) ;
                                  human <- sum(str_detect(x, "^hg38::")) ;
                                  data_frame(cell_id = y,
                                            mouse_new_genes = mouse,
                                            human_new_genes = human,
                                            new_genes = mouse + human)
                                 })
             gene_mdata <- left_join(shared_genes,
                                     not_recovered_genes,
                                     by = "cell_id") %>% 
               left_join(., new_genes, by = "cell_id")
             
             x <- add_metadata(x, gene_mdata)
             x
           })

resampled_metadat <- map(sc_objs, ~.x$meta_dat) %>% 
  bind_rows(.id = "library") %>% 
   mutate(library = factor(library, 
                          levels = libs)) %>% 
  arrange(resampled)
genes_recovered <- resampled_metadat %>% 
  dplyr::filter(library != reflib) %>% 
  dplyr::select(cell_id, 
                library,
                resampled,
                shared_genes,
                not_recovered_genes, 
                new_genes)

genes_recovered <- gather(genes_recovered, 
                          key = type, value = count, 
                          -cell_id, -resampled, -library)
genes_recovered <- mutate(genes_recovered,
                          type = str_replace_all(type, "_", "\n"))

plt <- ggplot(genes_recovered, 
       aes(cell_id, count)) +
  geom_point(aes(color = resampled),
             size = 0.6,
             alpha = 0.75) +
  facet_grid(type ~ library) +
  theme(axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        strip.text.y = element_text(size = 12,
                                    margin = margin(0,0.2,0,0.2, "cm"))) +
  scale_color_manual(values = color_palette)

plt 

save_plot("new_genes_detected.pdf", plt, base_width = 8, base_height = 8)

targeted <- genes_recovered %>% 
  dplyr::filter(library == resampled_lib,
                resampled) 
targeted
plt_dat <- genes_recovered %>% 
  dplyr::filter(library != resampled_lib,
                resampled) %>% 
  group_by(type) %>% 
  summarize(count = mean(count)) %>% 
  mutate(cell_id = "Not Targeted Barcodes") %>% 
  bind_rows(targeted, .) %>% 
  filter(type == "new\ngenes")


plt <- ggplot(plt_dat, 
       aes(cell_id, count)) +
  geom_bar(aes(fill = cell_id),
           stat = "identity") +
  labs(y = "Newly detected genes") + 
  theme(axis.title.x = element_blank(),
        axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        legend.position = "none") +
  scale_fill_brewer(palette = "Set1")

save_plot("new_genes_barplot.pdf", plt, 
          base_width = 4.25, base_height = 5)

## species specific

genes_recovered <- resampled_metadat %>% 
     dplyr::filter(library %in% resampled_libs) %>% 
     select(cell_id, resampled, purity_reads, ends_with("genes")) %>% 
  mutate(species = ifelse(purity_reads > 0.80,
                          "human",
                          ifelse(purity_reads < 0.20, 
                                 "mouse",
                                 "mixed"))) %>% 
  filter(species != "mixed") %>% 
  mutate(shared_genes_species = ifelse(species == "human",
                                          human_shared_genes,
                                          mouse_shared_genes),
         new_genes_species = ifelse(species == "human",
                                    human_new_genes,
                                    mouse_new_genes),
         not_recovered_genes_species = ifelse(species == "human",
                                              human_not_recovered_genes,
                                              mouse_not_recovered_genes)) %>% 
  select(cell_id, resampled, ends_with("_species")) %>% 
  gather(key = type, value = count, 
         -cell_id, -resampled) %>% 
   mutate(type = str_replace(type, "_species", "") %>% 
            str_replace_all(., "_", "\n"))

targeted <- genes_recovered %>% 
  dplyr::filter(resampled) 

plt_dat <- genes_recovered %>% 
  dplyr::filter(!resampled) %>% 
  group_by(type) %>% 
  summarize(count = mean(count)) %>% 
  mutate(cell_id = "Not targeted\nbarcodes\n(mean)") %>% 
  bind_rows(targeted, .) %>% 
  mutate(type = ifelse(type == "new\ngenes",
                        "Newly\ndetected\ngenes",
                        ifelse(type == "shared\ngenes",
                               "Previously\ndetected\ngenes",
                               ifelse(type == "not\nrecovered\ngenes",
                                      "Previously\ndetected\ngenes\nnot recovered",
                                      NA))))


plt <- ggplot(plt_dat, 
       aes(cell_id, count)) +
  geom_bar(aes(fill = type),
           stat = "identity") +
  labs(y = "# of genes") + 
  scale_x_discrete(labels = cell_names) + 
  scale_y_continuous(labels = scales::comma) + 
  scale_fill_brewer(palette = "Set1", name = "") +
  guides(fill = guide_legend(override.aes = list(size = 0.25))) +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        legend.position = "top",
        plot.margin = unit(c(5.5, 50.5, 5.5, 5.5), 
                           "points"))
      #  legend.key.size = unit(0.25, "pt")) 

save_plot("new_genes_species_specific_barplot.pdf", plt, 
          base_width = 4, base_height = 5) 

Plot expression per cell

MA plots

calc_ma <- function(xmat, ymat, cell = "GCAGTTAAGTGTCCAT"){
  x_rn <- rownames(xmat)
  y_rn <- rownames(ymat)
  xmat <- log2(xmat + 1)
  ymat <- log2(ymat + 1)
  
  rownames(xmat) <- x_rn
  rownames(ymat) <- y_rn
  m <- rowMeans(log2(((2^ymat + 2^xmat) / 2) + 1))
  a <- xmat[, cell] - ymat[, cell]
  data_frame(gene = names(a),
             mean_expression_log2 = m,
             log2_diff = a)
}


genes_to_plot <- rownames(sc_objs[[resampled_lib]]$umi_matrix)
cols <- colnames(sc_objs[[resampled_lib]]$norm_umi)
cell_ids <- str_c(cells[[resampled_lib]], "-1")

## append genes to reference library if necessary
ref_mat <- standardize_rows(sc_objs[[resampled_lib]]$umi_matrix[, cols],
                            sc_objs[[reflib]]$umi_matrix[, cols])

ma_dat <- map(cell_ids,
    ~calc_ma(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot, cols], 
             ref_mat[genes_to_plot, cols],
             cell = .x))

names(ma_dat) <- cell_ids
ma_dat <- bind_rows(ma_dat, .id = "cell")

plot_ma <- function(df){
  n_up <- filter(df, log2_diff > 0) %>% 
    group_by(cell) %>% 
    summarize(n = n(), n = paste0("up = ", n))
  
  n_down <- filter(df, log2_diff < 0) %>% 
    group_by(cell) %>% 
    summarize(n = n(), n = paste0("down = ", n))
  
  if (nrow(n_down) == 0) {
    n_down = data_frame(cell = df$cell %>% unique(),
                        n = "down = 0")
  }
  plt <- ggplot(df,
         aes(mean_expression_log2,
             log2_diff)) +
    geom_hline(aes(yintercept = 0), linetype = "dashed", colour = "grey") + 
    geom_point(size = 0.25) +
    geom_text(data = n_up, aes(x = max(ma_dat$mean_expression_log2) * 0.9, 
                               y = max(ma_dat$log2_diff) * 1.2, 
                               label = n)) +
    geom_text(data = n_down, aes(x = max(ma_dat$mean_expression_log2) * 0.9, 
                                 y = min(ma_dat$log2_diff) * 1.2, 
                                 label = n)) + 
    facet_wrap(~cell) +
    labs(x = expression(paste("Abundance (log"[2], ")")),
         y = expression(paste(frac("resampled","Original"), " (log"[2], ")")))
  plt
}

plt <- plot_ma(ma_dat)
plt

save_plot("per_cell_MA_plot_all_genes.pdf", plt, base_height = 6)

## Shared genes
ma_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- sc_objs[[resampled_lib]]$shared_genes[[x]]
    calc_ma(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(ma_dat) <- cell_ids
ma_dat <- bind_rows(ma_dat, .id = "cell")
plt <- plot_ma(ma_dat)
plt

save_plot("per_cell_MA_plot_shared_genes.pdf", plt, base_height = 6)


## New genes
ma_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- sc_objs[[resampled_lib]]$new_genes[[x]]
    calc_ma(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(ma_dat) <- cell_ids
ma_dat <- bind_rows(ma_dat, .id = "cell")
plt <- plot_ma(ma_dat)
plt

save_plot("per_cell_MA_plot_new_genes.pdf", plt, base_height = 6)

Histograms

get_expr <- function(xmat, ymat, cell = "GCAGTTAAGTGTCCAT"){
  xrows <- rownames(xmat)
  xmat <- log2(xmat[, cell] + 1)
  ymat <- log2(ymat[xrows, cell] + 1)
  data_frame(
    gene = xrows,
    resampled = xmat,
    original = ymat) %>% 
    gather(library, 
           Expression, -gene)
}

plot_histogram <- function(df){
  ggplot(df, 
         aes_string("Expression")) +
    geom_density(aes_string(fill = "library"),
                 alpha = 0.66) +
    scale_fill_viridis_d(name = "") +
    facet_wrap(~cell, nrow = 1) +
    theme(legend.position = "top",
          strip.text = element_text(size = 8)) 
}

expr_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- names(sc_objs[[resampled_lib]]$genes_detected[[x]])
    get_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
expressed_in_resampled_plt <- plot_histogram(expr_dat)
expressed_in_resampled_plt

expr_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- sc_objs[[resampled_lib]]$shared_genes[[x]]
    get_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
share_genes_plt <- plot_histogram(expr_dat)
share_genes_plt

expr_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- sc_objs[[resampled_lib]]$new_genes[[x]]
    get_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
new_gene_plt <- plot_histogram(expr_dat)


plts <- list(
  expressed_in_resampled_plt,
  share_genes_plt,
  new_gene_plt
)

plt <- plot_grid(plotlist = plts, ncol = 1)
plt

save_plot("expression_histograms.pdf", plt, 
          ncol = 1, nrow = 3,
          base_height = 4,
          base_aspect_ratio = 2)

scatterplots

get_paired_expr <- function(xmat, ymat, cell = "GCAGTTAAGTGTCCAT"){
  xrows <- rownames(xmat)
  xmat <- log2(xmat[, cell] + 1)
  ymat <- log2(ymat[xrows, cell] + 1)
  data_frame(
    gene = xrows,
    resampled = xmat,
    original = ymat)
}

plot_scatter <- function(df){
  
  ggplot(df, 
         aes_string("original", "resampled")) +
    geom_point(size = 0.5) + 
    geom_abline(aes(slope = 1, intercept = 0)) +
    facet_wrap(~cell, nrow = 1) +
    expand_limits(x = 0, y = 0) +
    coord_fixed() + 
    scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) +
    scale_y_continuous(breaks = scales::pretty_breaks(n = 5)) + 
    theme(legend.position = "top",
          strip.text = element_text(size = 8)) 
}

expr_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- names(sc_objs[[resampled_lib]]$genes_detected[[x]])
    get_paired_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
expressed_in_resampled_plt <- plot_scatter(expr_dat)
expressed_in_resampled_plt

expr_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- sc_objs[[resampled_lib]]$shared_genes[[x]]
    get_paired_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
share_genes_plt <- plot_scatter(expr_dat)
share_genes_plt

expr_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- sc_objs[[resampled_lib]]$new_genes[[x]]
    get_paired_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
new_gene_plt <- ggplot(expr_dat, aes(cell,resampled)) +
  geom_jitter(alpha = 0.55) +
  geom_violin(aes(fill = cell)) +
  ylim(0, max(expr_dat$resampled) * 1.10) + 
  scale_fill_brewer(palette = "Set1") +
    theme(axis.text.x = element_text(angle = 90, 
                                     hjust = 1, vjust = 0.5),
          legend.pos = "none",
          axis.title.x = element_blank()) 

new_gene_plt

plts <- list(
  expressed_in_resampled_plt,
  share_genes_plt
)

plt <- plot_grid(plotlist = plts, ncol = 1)
plt

save_plot("expression_scatterplots.pdf", plt, 
          ncol = 1, nrow = 3,
          base_height = 4,
          base_aspect_ratio = 2)

save_plot("expression_newgenes_violinplots.pdf", new_gene_plt, 
          base_height = 6,
          base_aspect_ratio = 0.5)

Parse out new versus previously identified UMIs

compare_umis <- function(path_to_ctrl,
                         path_to_test,
                         return_summary = F){

  ## umi seqs should be produced by ./get_molecule_info
  ctrl_umi_seqs <- read_tsv(path_to_ctrl,
                   col_names = c("barcode_10x", 
                                 "umi_molecule", 
                                 "count")) %>% 
  filter(barcode_10x != "Cell_unmatched")

  test_umi_seqs <- read_tsv(path_to_test,
                   col_names = c("barcode_10x", 
                                 "umi_molecule", 
                                 "count")) %>% 
  filter(barcode_10x != "Cell_unmatched")

  umi_seqs <- full_join(ctrl_umi_seqs, 
          test_umi_seqs, 
          by = c("barcode_10x", "umi_molecule"))
  
  if (return_summary) {
    umi_seqs %>% 
      mutate(new_umi = ifelse(is.na(count.x) & !is.na(count.y), 
                          1L, 
                          0L),
         not_detected_umi = ifelse(!is.na(count.x) & is.na(count.y),
                                   1L,
                                   0L),
         shared_umi = ifelse(!is.na(count.x) & !is.na(count.y),
                             1L,
                             0L)) %>% 
      group_by(barcode_10x) %>% 
      summarize(new_umis = sum(new_umi),
            not_detected_umis = sum(not_detected_umi),
            shared_umis = sum(shared_umi))
  } else {
    umi_seqs
  }
}

umi_files <- file.path(data_dir, libs, "umis", "umigroups.txt.gz")

umi_summaries <- map(umi_files[2],
                   ~compare_umis(umi_files[1], .x, return_summary = T))

names(umi_summaries) <- umi_files[2] %>% 
  str_split(., "/") %>% 
  map_chr(~.x[7])

umi_summary <- bind_rows(umi_summaries, .id = "library")

umis_recovered <- umi_summary %>% 
  gather(class, count, -barcode_10x, -library) 

## annotate with resampled or not

cell_annot <- data_frame(barcode_10x = c(cells[1], cells),
                         library = libs,
                         resampled = T) %>% 
  unnest()

umis_recovered <- umis_recovered %>% 
  mutate(resampled = ifelse(str_replace(barcode_10x, "-1", "") %in% unlist(cells),
                             T,
                             F)) %>% 
  arrange(resampled)

plt <- ggplot(umis_recovered, 
       aes(barcode_10x, count)) +
  geom_point(aes(colour = resampled),
             size = 0.6,
             alpha = 0.75) +
  facet_grid(library ~ class) +
  theme(axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        strip.text.y = element_text(size = 12,
                                    margin = margin(0,0.2,0,0.2, "cm"))) +
  scale_color_manual(values = color_palette)

plt 

umi_seqs <- map(umi_files[2],
                ~compare_umis(umi_files[1], .x, return_summary = F))

names(umi_seqs) <- umi_files[2] %>% 
  str_split(., "/") %>% 
  map_chr(~.x[7])

new_umis <- map(umi_seqs, 
    ~filter(.x, 
       str_replace(barcode_10x, "-1", "") %in% unlist(cells), 
       !is.na(count.y), 
       is.na(count.x))  %>% 
  separate(umi_molecule, c("seq", "genome", "gene"), sep = "::") %>% 
    select(-starts_with("count"))) 

old_umis <- map(umi_seqs, 
    ~filter(.x, 
       str_replace(barcode_10x, "-1", "") %in% unlist(cells), 
       !is.na(count.x),
       !is.na(count.y))  %>% 
  separate(umi_molecule, c("seq", "genome", "gene"), sep = "::") %>% 
    select(-starts_with("count"))) 


umi_edit_dist <- map2(new_umis,
                      old_umis,
                     ~left_join(.x, .y, 
               by = c("barcode_10x", "genome", "gene")) %>% 
  na.omit() %>% 
  mutate(ed = kentr::get_hamming_pairs(seq.x, seq.y)) %>% 
  group_by(barcode_10x, seq.y, genome, gene) %>% 
  summarize(min_ed = as.integer(min(ed))) %>% 
  ungroup())

umi_edit_dist <- bind_rows(umi_edit_dist, 
                           .id = "library")

umi_edit_dist <-  mutate(umi_edit_dist, 
                          barcode_10x = factor(barcode_10x, 
                              levels = str_c(cells[[resampled_lib]], "-1")))

plt <- ggplot(umi_edit_dist, aes(barcode_10x, 
                          min_ed)) + 
  geom_boxplot(coef = Inf) +
  scale_x_discrete(labels = cell_names) +
  facet_wrap(~library, scales = "free_x", 
             labeller = labeller(library = lib_names)) +
  scale_y_continuous(breaks= scales::pretty_breaks()) +
  labs(y = "Minimum edit distance\noriginal vs. new UMIs") +
  theme(axis.text.x = element_text(angle = 90,
                                   hjust = 1, 
                                   vjust = 0.5),
        axis.title.x = element_blank())

save_plot("umi_edit_distance.pdf", plt)
rm(umi_seqs)

tSNE analysis

original library tSNE

library(Seurat)

mat <- sc_objs[[reflib]]$umi_matrix
sobj <- CreateSeuratObject(mat)
sobj <- NormalizeData(sobj)
sobj <- ScaleData(sobj)
sobj <- FindVariableGenes(sobj, do.plot = F, y.cutoff = 0.33)
sobj <- RunPCA(sobj, pc.genes = sobj@var.genes, 
               pcs.compute = 20, 
               do.print = F, seed.use = 20180521)
sobj <- RunTSNE(sobj, dims.use = 1:7, seed.use = 20180521)
sobj <- FindClusters(sobj,
                     dims.use = 1:7, 
                     resolution = 0.6, 
                     print.output = F, 
                     random.seed = 20180521)
plt <- TSNEPlot(sobj, 
                colors.use = c(brewer.pal(12, "Paired"), 
                               brewer.pal(9, "Set1")),
                do.label = T) + 
  labs(title = "NIH3T3 and 293T") +
  theme(legend.position = "none")

plt

save_plot("original_cells_tsne.pdf", plt, 
          base_height = 4.25, base_width = 4.25)

plts <- FeaturePlot(sobj, c("mm38::Malat1", "hg38::MALAT1"), do.return = T)

plt <- plot_grid(plotlist = plts, nrow = 1)
save_plot("original_cells_mouse_human_markers.pdf", plt, 
          base_height = 4.25, base_width = 8.5)

original library tSNE supplemented with resampled barcodes

mat <- sc_objs[[reflib]]$umi_matrix

resampled_ids <- sc_objs[[resampled_lib]]$meta_dat %>% 
  filter(resampled) %>% 
  pull(cell_id)

resampled_mat <- sc_objs[[resampled_lib]]$umi_matrix[, resampled_ids]
colnames(resampled_mat) <- str_c(colnames(resampled_mat), 
                                  "::",
                                  "resampled")

mat <- as.data.frame(as.matrix(mat)) %>%
  rownames_to_column("gene")
resampled_mat <- as.data.frame(as.matrix(resampled_mat)) %>%
  rownames_to_column("gene")

combined_mats <- left_join(mat, resampled_mat, by = c("gene")) 
combined_mats <- as.data.frame(combined_mats) %>% 
  column_to_rownames("gene") %>% 
  as.matrix() %>% 
  as(., "sparseMatrix")   

combined_mats[is.na(combined_mats)] <- 0

sobj <- CreateSeuratObject(combined_mats)

new_ids <- sobj@meta.data %>% 
  rownames_to_column("cell") %>% 
  mutate(resampled = ifelse(str_detect(cell, "resampled"),
                             "resampled",
                             "not resampled"))

resampled_cell_ids <- new_ids[new_ids$resampled == "resampled", "cell"] %>% 
  str_replace("::resampled", "")
 
new_ids <- mutate(new_ids, 
                  resampled = ifelse(cell %in% resampled_cell_ids, 
                                      "original cell",
                                      resampled)) %>% 
  select(cell, resampled) %>% 
  as.data.frame(.) %>% 
  column_to_rownames("cell")

sobj <- AddMetaData(sobj, new_ids)
sobj <- NormalizeData(sobj)
sobj <- ScaleData(sobj)
sobj <- FindVariableGenes(sobj, do.plot = T, y.cutoff = 1)

sobj <- RunPCA(sobj, pc.genes = rownames(sobj@data), 
               pcs.compute = 20, 
               do.print = F, seed.use = 20180522)
sobj <- RunTSNE(sobj, dims.use = 1:7, seed.use = 20180522)
sobj <- FindClusters(sobj, 
                     dims.use = 1:7, 
                     resolution = 0.6, 
                     print.output = F, 
                     random.seed = 20180522)
mdata <- sc_objs$mouse_human_cell_pulldown$meta_dat %>% 
  select(cell_id, resampled, matches("purity_umis"))

mdata_sub <- filter(mdata, resampled) %>% 
  select(cell_id, resampled, proportion_human = purity_umis) %>% 
  mutate(cell_id = str_c(cell_id, "::resampled"),
         proportion_mouse = 1 - proportion_human) 

mdata_ogsub <- filter(mdata, resampled) %>% 
  select(cell_id, resampled, proportion_human = og_purity_umis) %>% 
  mutate(proportion_mouse = 1 - proportion_human) 

mdata_not_sub <- filter(mdata, !resampled) %>% 
  select(cell_id, resampled, proportion_human = og_purity_umis) %>% 
  mutate(proportion_mouse = 1 - proportion_human)

mdata <- bind_rows(list(mdata_sub, mdata_ogsub, mdata_not_sub)) %>% 
  unique() %>% 
  mutate(species = ifelse(proportion_human > 0.90,
                          "Human",
                          ifelse(proportion_mouse > 0.90,
                                 "Mouse",
                                 "Doublet"))) %>% 
  as.data.frame() %>% 
  tibble::column_to_rownames("cell_id") %>% 
  select(-resampled) 


sobj <- AddMetaData(sobj, mdata)
sobj <- SetAllIdent(sobj, "species")

tsne_dat <- GetCellEmbeddings(sobj, reduction.type = "tsne") %>% 
  as.data.frame() %>% 
  tibble::rownames_to_column("cell")

mdata <- sobj@meta.data %>% 
  as.data.frame() %>% 
  tibble::rownames_to_column("cell")

tsne_dat <- left_join(tsne_dat, 
                         mdata,
                         by = "cell")
plt <- ggplot(tsne_dat, 
              aes(tSNE_1, tSNE_2)) + 
  geom_point(aes(color = species), size = 0.1) + 
  scale_color_manual(values = brewer.pal(3, "Paired"),
                     name = "") +
  labs(title = "",
       x = "tSNE 1",
       y = "tSNE 2") +
  theme_cowplot() + 
  guides(color = guide_legend(nrow = 3, 
                              ncol = 1,
                              override.aes = list(size = 4))) + 
  theme(legend.position = c(1.05, 1.05),
        legend.justification = c("right", "top"),
        legend.box.just = "right")  

save_plot("original_resampled_cells_mouse_human.pdf", plt, 
          base_height = 4.25, base_width = 8.5)


og_cell_dat <- tsne_dat %>% 
  filter(cell %in% str_c(cells[[resampled_lib]], "-1")) 

resampled_cell_dat <- tsne_dat %>% 
  filter(cell %in% str_c(cells[[resampled_lib]], 
                                 "-1::resampled")) %>% 
  mutate(cell = str_replace(cell, "::resampled", "")) %>% 
  select(cell, tSNE_1, tSNE_2)

cell_dat <- left_join(og_cell_dat, 
                      resampled_cell_dat, 
                      by = "cell", 
                      suffix = c("", "_resampled"))


resampled_cells <- ggplot(tsne_dat, 
              aes(tSNE_1, tSNE_2)) + 
  geom_point(aes(color = resampled), size = 0.1) + 
  scale_color_manual(values = c("lightgrey",
                                brewer.pal(7, "Set1")[1:2]),
                     name = "",
                     labels = c("not resampled" = "Not Resampled",
                                "original cell" = "Original Cell",
                                "resampled" = "Resampled Cell")) +
  geom_segment(data = cell_dat, 
                               aes(x = tSNE_1,
                                   y = tSNE_2, 
                                   xend = tSNE_1_resampled,
                                   yend = tSNE_2_resampled),
                               linejoin = "mitre",
                               arrow = arrow(length = unit(0.03, "npc"))) + 
  labs(title = "",
       x = "tSNE 1",
       y = "tSNE 2") +
  theme_cowplot() + 
  guides(color = guide_legend(nrow = 3, 
                              ncol = 1,
                              override.aes = list(size = 4))) + 
  theme(legend.position = c(1.05, 1.05),
        legend.justification = c("right", "top"),
        legend.box.just = "right")  

plt <- plot_grid(plt, 
                 resampled_cells,
                 ncol = 2)
plt

save_plot("resampled_tsne.pdf", plt, 
          nrow = 1, ncol = 2,
          base_height = 5.5, base_width = 4.25)
saveRDS(sobj, "rs_sobj.rds")

kNN analysis

Find the k-nearest neighbors in PCA space

## use combined data from above
data.use <- GetCellEmbeddings(object = sobj,
                              reduction.type = "pca",
                              dims.use = 1:7)

## findnearest neighboors using exact search
knn <- RANN::nn2(data.use, k = 5,
                 searchtype = 'standard',
                 eps = 0)

resampled_idxs <- knn$nn.idx[str_detect(rownames(data.use), "::resampled"), ]

nn_ids <- as_data_frame(t(apply(resampled_idxs, 1,
                      function(x)rownames(data.use)[x])))

colnames(nn_ids) <- c("query_cell", 
                      paste0("nearest neighbor ", 1:(ncol(nn_ids) - 1)))

nn_ids

Species specificity

resampled_metadat <- map(sc_objs, ~.x$meta_dat) %>% 
  bind_rows(.id = "library") %>% 
   mutate(library = factor(library, 
                          levels = unlist(libs))) %>% 
  arrange(resampled)

cell_names_df <- data_frame(old_id = names(cell_names), 
                                  new_id = cell_names) 

plt_dat <- resampled_metadat %>% 
  filter(resampled, library != reflib) %>% 
  left_join(.,
            cell_names_df, by = c("cell_id" = "old_id")) %>% 
  select(new_id, library, contains("purity")) %>% 
  gather(type, purity, -new_id, -library) %>% 
  mutate(type = ifelse(str_detect(type, "^og"),
                       str_replace(type, "^og_", "original_"),
                       str_c("resampled_", type)),
         library = factor(library, levels = resampled_libs)) %>% 
  separate(type, c("resampled", "not_import", "count_type"), sep = "_") %>% 
  select(-not_import)

plt <- ggplot(plt_dat, aes(new_id, purity)) +
  geom_hline(aes(yintercept = 0.80), linetype = "dashed", 
             color = "grey", alpha = 0.75) +
  geom_hline(aes(yintercept = 0.20),  linetype = "dashed", 
             color = "grey", alpha = 0.75) +
  geom_point(aes(color = resampled), 
             position = position_dodge(width = 0.5)) +
  facet_wrap(~count_type, 
             labeller = labeller(count_type = c(
               "reads" = "Read Count",
               "umis" = "UMI Count"
             )),
             scales = "free_x") +
  scale_color_brewer(palette = "Set1", name = "") +
  guides(color = guide_legend(override.aes = list(size = 5))) +
  labs(x = "",
       y = expression(paste( "Species Purity ", 
                             frac("Human", "Human + Mouse")))) +
  theme(axis.text.x = element_text(angle = 90,
                                   hjust = 1,
                                   vjust = 0.5),
        legend.pos = "top")

save_plot("species_specificity.pdf", plt,
          base_height = 5, base_aspect_ratio = 1)